// LEC March 27, 2021
//
// STATE SIZE PROJECT
// Second Prep File




////////////////////////////////////////////////////////////////////////////////// 
// Create files which hold properties of AGs: AG rather than state perspective!
// CODE SO FAR NOT USED THAT MUCH BUT COULD BE EXPANDED
//////////////////////////////////////////////////////////////////////////////////

// Start from the intermediate AG segment data written by the previous routine
cd $OUTDIR
use agdata, clear

// One row per AG and year: first non-missing name, maximum of nag and agtf,
// and the summed (corrected) segment area and population
collapse (firstnm) ethnicname (max) nag agtf ///
    (sum) ethnicareacorr ethnicpopulationcorr, by(ethnicid year)
gen agarea = ethnicareacorr
sort ethnicid year

// Yearly weighted averages of agtf: agtfaw is weighted by area, agtfpw by population.
// Each spec reads "<suffix of yearly total> <weight variable> <suffix of index>".
foreach spec in "area ethnicareacorr aw" "pop ethnicpopulationcorr pw" {
    tokenize `spec'
    bysort year: egen sumag`1' = total(`2')
    bysort year: egen agtf`3' = total(`2' * agtf)
    replace agtf`3' = agtf`3' / sumag`1'
}

gen lnagarea = log(agarea)

// Store the AG-level variables
// cd "/Users/larsc/polybox/Shared/NASTACdb/ethnicity-682/CShapes"
cd $OUTDIR
save agvars, replace

// Reduce to one row per year describing the whole system:
// yearly means of the AG variables plus the (constant within year) weighted indices
collapse (mean) nag agtf agarea lnagarea (max) agtfaw agtfpw, by(year)
cd $OUTDIR
save agsystemvars, replace


////////////////////////////////////////////////////////////////////////////////// 
// Creates the borderchange file, which holds all "colorblind" change processes
// The goal is to create a dataset that only contains dyadic border changes
// listing the gaining state first (State A) followed by the losing state (State B)
//////////////////////////////////////////////////////////////////////////////////


cd $INDIRAGG

insheet using $DYADFILE, clear

// Dyad identifiers: idd is directed (State A first); idd2 is undirected and
// always puts the smaller state id first
gen idd = ida*1000 + idb
gen idd2 = cond(idb != . & idb < ida, idb*1000 + ida, idd)

xtset idd year

// Gain/loss areas: missing values, and every value recorded in the first
// sample year, are set to zero
foreach v of varlist gainaarea lossaarea gainbarea lossbarea {
    replace `v' = 0 if `v' == . | year == $STARTYEAR
}

// Net dyadic gain of State A: the main variable of this section
gen netdgain = gainaarea - lossaarea

// Keep only dyad-years that record a border change
drop if netdgain == . | netdgain == 0

// Attach state-level data (notably birth and death), beginning with State A
gen id = ida
sort id year
cd $OUTDIR
merge id year using statevars
drop _merge

// Copy the merged attributes into State-A-specific names (suffix "a") ...
foreach v in idc namechange birth death area population capital_geom capital {
    gen `v'a = `v'
}
// ... and release the generic names so that a later merge can fill them again
drop id birth death population area idc namechange capital capital_geom

// Remove rows without a complete dyad-year key
// (the old-syntax merge also appends rows that exist only in statevars)
drop if ida == . | idb == . | year == .

// Repeat the state-level merge for State B
gen id = idb
sort id year
cd $OUTDIR
merge id year using statevars
drop _merge

// State-B-specific copies (suffix "b"), then release the generic names
foreach v in idc namechange birth death capital_geom capital {
    gen `v'b = `v'
}
drop id birth death idc namechange capital capital_geom

// Remove rows without a complete dyad-year key
drop if ida == . | idb == . | year == .

// A missing death indicator for State B is treated as "no death"
replace deathb = 0 if deathb == .


// statevars holds no rows for a state after its death, so State B's capital is
// looked up one year earlier: shift year back by one, merge, then shift it forward again
// I KNOW THIS IS UGLY: I'M SURE THIS CAN BE HANDLED BETTER IN R OR SQL
gen id = idb
replace year = year - 1
sort id year
cd $OUTDIR
merge id year using statevars
drop _merge

// Lagged capital information of State B (prefix "l", suffix "b")
foreach v in capital_geom capital {
    gen l`v'b = `v'
}
drop id capital capital_geom

// Remove rows without a complete dyad-year key, then restore the original year
drop if ida == . | idb == . | year == .
replace year = year + 1



// With continuous IDs (idc), remove changes that turn out not to be changes
// (e.g. if Austria-Hungary is inherited by Austria with the same idc == 300,
// the death of the former and the birth of the latter need to be deleted)
// drop if idcb == . & idca != ida
drop if $IDCONT & (namechangea==1 & namechangeb==.)

// Keep only rows in which State A is the gaining and State B the losing side
// (a missing netdgain is never < 0, so such rows are kept)
drop if netdgain < 0

// samecapital is currently always 0: the rule that would set it is switched off
gen samecapital = 0
// replace samecapital = idd if capitala == lcapitalb & capitala != "" & lcapitalb != "" if ! $IDCONT

// Spread any samecapital flag over all rows of the same state-year and
// cancel the corresponding birth of A / death of B
bys ida year: egen nobirtha = max(samecapital)
bys idb year: egen nodeathb = max(samecapital)
replace birtha = 0 if nobirtha > 0
replace deathb = 0 if nodeathb > 0

// Border change type from (birtha, deathb):
//   1 = (0,0)    2 = (0,1)    3 = (1,0)    4 = (1,1)    missing otherwise
gen bctype = cond(birtha==0 & deathb==0, 1, ///
             cond(birtha==0 & deathb==1, 2, ///
             cond(birtha==1 & deathb==0, 3, ///
             cond(birtha==1 & deathb==1, 4, .))))
replace bctype = . if samecapital > 0
replace netdgain = 0 if samecapital > 0

// The four types are labelled transfer, absorb, secession and collapse.
// These dummies are the key variables used further below.
gen transfer  = (bctype == 1)
gen absorb    = (bctype == 2)
gen secession = (bctype == 3)
gen collapse  = (bctype == 4)

sort ida idb year
cd $OUTDIR
save borderchange, replace




///////////////////////////////////////////////////////////////////////////////////////////
// Identify ethnic border change by producing the ethbc file
// This file holds both "colorblind" and ethnic border changes
// Input file holds all ethnic overlaps with border changes (each line corresponds to an AG)
///////////////////////////////////////////////////////////////////////////////////////////


cd $INDIRAGG
insheet using $ETHDYADFILE, clear

// Remove repeated rows for the same group within the same dyad-year.
// year has to be part of the key: the data are matched on (ida idb year) right
// below, and without it a group that is involved in a later border change of
// the same dyad would be discarded as a "duplicate" of the earlier one.
duplicates drop ida idb ethnicid year, force

// Bring in the colorblind border changes stored in borderchange
cd $OUTDIR
sort ida idb year
merge ida idb year using borderchange
drop _merge

// Remove rows without a complete dyad-year key
drop if ida == . | idb == . | year == .

// Attach the state-level ethnic variables, beginning with State A ...
cd $OUTDIR
gen id = ida
sort id year
merge id year using ethnicvars
drop _merge

// Keep maxethid / maxethname under State-A-specific names (suffix "a")
foreach v in maxethid maxethname {
    gen `v'a = `v'
}
drop maxethid maxethname id

// Remove rows without a complete dyad-year key
drop if ida == . | idb == . | year == .

// ... followed by State B
cd $OUTDIR
gen id = idb
sort id year
merge id year using ethnicvars
drop _merge

// Keep maxethid / maxethname under State-B-specific names (suffix "b")
foreach v in maxethid maxethname {
    gen `v'b = `v'
}
drop id maxethid maxethname

// Remove rows without a complete dyad-year key
drop if ida == . | idb == . | year == .

// ethnicvars holds no rows for a state after its death, so State B's values are
// also looked up one year earlier: shift year back by one, merge, shift forward again
// I KNOW THIS IS UGLY: I'M SURE THIS CAN BE HANDLED BETTER IN R OR SQL
cd $OUTDIR
gen id = idb
replace year = year - 1
sort id year
merge id year using ethnicvars
drop _merge

// Lagged State B versions (prefix "l", suffix "b")
foreach v in maxethid maxethname {
    gen l`v'b = `v'
}
drop id maxethid maxethname

// Remove rows without a complete dyad-year key, then restore the original year
drop if ida == . | idb == . | year == .
replace year = year + 1


// Compute ethnic matches between the group in the traded territory (ethnicid)
// and the largest groups in States A and B respectively.
// Stata evaluates ". == ." as true, so every comparison below is guarded
// against missing values; otherwise rows without group information would
// be counted as matches.
gen maxethmatcha = 0
replace maxethmatcha = 1 if maxethida == ethnicid & ethnicid != .
gen maxethmatchb = 0
replace maxethmatchb = 1 if maxethidb == ethnicid & ethnicid != .
gen lmaxethmatchb = 0  // lagged match indicator for the losing State B
replace lmaxethmatchb = 1 if lmaxethidb == ethnicid & ethnicid != .

// Make the information about a possible match available on every row of the dyad-year.
// max is used because a match with ANY of the overlapping groups counts.
bys ida idb year: egen ethmatcha = max(maxethmatcha)
bys ida idb year: egen ethmatchb = max(maxethmatchb)
bys ida idb year: egen lethmatchb = max(lmaxethmatchb)

// A more demanding matching criterion based on the largest group in the traded territory.
// maxgainpop is missing when no population figure exists for the dyad-year; in
// that case no row is marked as the largest group (maxgainethid stays 0).
bys ida idb year: egen maxgainpop = max(gainaethnicpopulationcorr)
gen maxgainethid = 0
replace maxgainethid = ethnicid if maxgainpop==gainaethnicpopulationcorr & maxgainpop != .
gen maxgainethname = ""  // NOT USED: name matching possible too
replace maxgainethname = ethnicname if maxgainpop==gainaethnicpopulationcorr & maxgainpop != .

// Population-based matches; a missing group id never counts as a match
gen popethmatcha = 0
replace popethmatcha = 1 if maxgainethid == maxethida & maxgainethid != .
gen popethmatchb = 0
replace popethmatchb = 1 if maxgainethid == maxethidb & maxgainethid != .
// replace popethmatchb = 1 if maxgainethname == maxethnameb // name matching not used
gen lpopethmatchb = 0   // lagged population match variable for State B
replace lpopethmatchb = 1 if maxgainethid == lmaxethidb & maxgainethid != .

// Reduce the group-level rows to one row per directed dyad (A, B) and year
collapse (firstnm) namea nameb maxgainethname ///
    (max) samecapital bctype birtha deathb netdgain ethnicid ///
          transfer absorb secession collapse ///
          maxethida maxethidb lmaxethidb ///
          maxethmatcha maxethmatchb lmaxethmatchb ///
          popethmatcha popethmatchb lpopethmatchb, by(ida idb year)

// Where State B has no current largest-group id, fall back on last year's value
replace maxethidb = lmaxethidb if maxethidb == .

sort ida idb year

// Ethnic border change variables

// Irredentism: a transfer in which the gaining state has kin in the traded territory.
// This is the less stringent definition.
gen irr = (transfer==1 & maxethmatcha==1)
// More stringent option: the largest group of the traded area has to match State A's largest group
// gen irr = (transfer==1 & popethmatcha==1)

// Ethnic absorption (or unification): an absorption in which both states have the same largest group
gen ethabsorb = (absorb==1 & maxethida == lmaxethidb & maxethida != . & lmaxethidb != .)

// Ethnic secession: a secession in which the breakaway state's largest group differs
// from the rump state's largest group; secessions without any group id also count
gen ethsec = (secession==1 & ((maxethida != maxethidb & maxethida != . & maxethidb != .) | ethnicid == .))

// Ethnic collapse: a collapse in which the newborn state's largest group differs
// from the dying state's largest group
gen ethcollapse = (collapse==1 & maxethida != lmaxethidb & maxethida != . & lmaxethidb != .)

// Dummy identifying any kind of ethnic border change
gen ethchange = (irr==1 | ethabsorb==1 | ethsec==1 | ethcollapse==1)

// Split netdgain into an ethnic and a non-ethnic component for the dyadic computations below
gen netdgaineth = ethchange * netdgain
gen netdgainneth = (1-ethchange) * netdgain

replace samecapital = 0 if samecapital == .

cd $OUTDIR
save ethbc, replace

/////////////////////////

// State A perspective: aggregate the dyadic border changes to one row per gaining state and year
cd $OUTDIR
use ethbc, clear

// Divide the birth-type dummies by the number of birth-type rows recorded for
// the same State A and year, so that they are shares before being summed below
bys ida year: egen sumbirth = total(secession + collapse)
foreach v in secession collapse {
    replace `v' = `v' / sumbirth
}
// Same scaling for the ethnic versions
bys ida year: egen sumethbirth = total(ethsec + ethcollapse)
foreach v in ethsec ethcollapse {
    replace `v' = `v' / sumethbirth
}

collapse (firstnm) namea nameb (max) samecapital ///
    (sum) bctype netdgain netdgaineth netdgainneth ///
          transfer absorb secession collapse ///
          irr ethabsorb ethsec ethcollapse, by(ida year)

replace samecapital = 0 if samecapital == .

sort ida year
save ethbca, replace

/////////////////////////

// State B perspective: aggregate the dyadic border changes to one row per losing state and year
cd $OUTDIR
use ethbc, clear

// Divide the death-type dummies by the number of death-type rows recorded for
// the same State B and year, so that they are shares before being summed below
bys idb year: egen sumdeath = total(absorb + collapse)
foreach v in absorb collapse {
    replace `v' = `v' / sumdeath
}
// Same scaling for the ethnic versions
bys idb year: egen sumethdeath = total(ethabsorb + ethcollapse)
foreach v in ethabsorb ethcollapse {
    replace `v' = `v' / sumethdeath
}

collapse (firstnm) namea nameb (max) samecapital ///
    (sum) bctype netdgain netdgaineth netdgainneth ///
          transfer absorb secession collapse ///
          irr ethabsorb ethsec ethcollapse, by(idb year)

replace samecapital = 0 if samecapital == .

sort idb year
save ethbcb, replace




///////////////////////////////////////////////////////////////////////////////////////////
// FINAL STATE LEVEL ANALYSIS WITH UNIT PROC AND ETHNIC VARS
// This creates the analysis dataset with all inputs
///////////////////////////////////////////////////////////////////////////////////////////



cd $OUTDIR
use statevars, clear

// One row per state-year, declared as a panel
duplicates drop id year, force
xtset id year

// State-level gain/loss areas must not hold missing values
foreach v of varlist gainarea lossarea {
    replace `v' = 0 if `v' == .
}

// Main unlogged net gain; left missing in the first sample year
gen netgain = gainarea - lossarea if year>$STARTYEAR

// Dummy version: net gain above 10
gen netgain1 = (netgain>10 & netgain!=.)

// Net loss
// NOTE(review): unlike netgain this is not restricted to years after $STARTYEAR - confirm this is intended
gen netloss = lossarea - gainarea

// Dummy version: net loss above 10
gen netloss1 = (netloss>10 & netloss!=.)

// Main logged DV of net gains: log(netgain) for gains, -log(netloss) for losses, 0 otherwise
gen lnetgain = cond(netloss>0 & netloss!=., -log(netloss), ///
               cond(netgain>0 & netgain!=., log(netgain), 0))

xtset id year



// Merge in the state-level variables from statevars
// (the data in memory were themselves loaded from statevars at the top of this section)
cd $OUTDIR
sort id year
merge 1:1 id year using statevars
drop _merge
drop if id == . | year == .

// Merge in the state-level ethnic variables (e.g. ethnic fractionalization)
cd $OUTDIR
sort id year
merge id year using ethnicvars
drop _merge
drop if id == . | year == .

// Merge in the aggregated border change data for each side of the dyads,
// starting with State A (the gaining side) ...
cd $OUTDIR
gen ida = id
sort ida year
merge ida year using ethbca // borderida
drop _merge

// Variables taken over from ethbca; each is kept under its name plus suffix "a"
local bcvars samecapital transfer absorb secession collapse irr ///
    ethabsorb ethsec ethcollapse netdgain netdgaineth netdgainneth
foreach v of local bcvars {
    gen `v'a = `v'
}

drop if id == . | year == .
drop ida `bcvars'

// State-years without any matching border change get zeros instead of missing values
foreach v of local bcvars {
    replace `v'a = 0 if `v'a == .
}

// ... followed by State B (the losing side)
cd $OUTDIR
gen idb = id
sort idb year
merge idb year using ethbcb
drop _merge

// Variables taken over from ethbcb; each is kept under its name plus suffix "b"
local bcvars samecapital transfer absorb secession collapse irr ///
    ethabsorb ethsec ethcollapse netdgain netdgaineth netdgainneth
foreach v of local bcvars {
    gen `v'b = `v'
}

drop if id == . | year == .
drop idb `bcvars'

// State-years without any matching border change get zeros instead of missing values
foreach v of local bcvars {
    replace `v'b = 0 if `v'b == .
}

// With the continuous-ID option switched on, rows without a valid idc are removed
drop if ($IDCONT) & idc == .

/*
gen samecapital = .
replace samecapital = samecapitala if samecapitala > 0
replace samecapital = samecapitalb if samecapitalb > 0 & samecapitala == 0

bys samecapital: egen maxid = max(id) 
bys samecapital: egen minid = min(id) 


gen otherid = 0
replace otherid = minid if id==maxid & maxid!=.
replace otherid = maxid if id==minid & minid!=.
replace otherid = 0 if samecapital==.


sort id year
bys id (year): gen otherid1 = sum(otherid)

drop idc


gen idc = id

replace idc = abs(otherid1) if otherid1 != 0 & otherid1 != .

*/


sort idc year
xtset idc year

// Year-on-year change in area and its signed log transform
// (log(1+x) for increases, -log(1-x) for decreases, 0 when unchanged or unknown)
gen netcgain = area - l.area
gen lnetcgain = cond(netcgain == ., 0, ///
                cond(netcgain > 0, log(1+netcgain), ///
                cond(netcgain < 0, -log(1-netcgain), 0)))

// Net dyadic gains: State A (gaining) side minus State B (losing) side
gen netdcgain = netdgaina - netdgainb
gen netethdcgain = netdgainetha - netdgainethb
gen netnethdcgain = netdgainnetha - netdgainnethb

// Birth of a state out of terra nullius: area appears without a preceding value
// and without a secession/collapse recorded on the State A side
gen terrnull = (l.area==. & area!=. & secessiona==0 & collapsea==0 & year!=$STARTYEAR)

// Logged area of the state
gen lnarea = log(area)

// Number of states (with known area) in each year
bys year: egen n = count(area)

// Inputs for distributional measures such as log-normality:
// rank by descending area within year, and the rank-based proportion
bys year: egen rank = rank(-area)
gen pr = rank/(n+1)
gen lnpr = log(pr)

// Does the state's largest group have TEK?
gen tek = (agn>1 & agn!=.)
// Logged population of the AG linked to the largest group segment
gen lnagpop = log(agpop)

// Dyadic gains split into ethnic and non-ethnic parts (A side minus B side)
gen netdgaineth = netdgainetha - netdgainethb
gen netdgainneth = netdgainnetha - netdgainnethb

// The by-year computations above re-sorted the data: restore the panel order
sort idc year
xtset idc year



// Flag "old" states, i.e. states that already exist at the beginning of the sample
gen startyear = .
replace startyear = 1 if year==$STARTYEAR
// idc*startyear is non-missing only in $STARTYEAR, so idc0 is non-missing
// exactly for those states that have an observation in the first sample year
bys idc: egen idc0 = max(idc*startyear)
gen oldstate = 0
// A state is old when it WAS observed in $STARTYEAR (idc0 not missing).
// The previous condition (idc0==.) flagged the states born later instead.
replace oldstate = 1 if idc0 != .

// Rudimentary terrain variables
gen lnelevmean = log(elevationmean+1)
gen lnelevsd = log(elevationsd+1)


// Running totals of past border change within each state (idc), ordered by year.
// Each pair reads "<suffix of cumulative variable> <source variable>".
foreach pair in "sec secessionb" "absorb absorba" "transf transfera" "transfloss transferb" {
    tokenize `pair'
    bysort idc (year): gen cumul`1' = sum(`2')
}

// Lagged versions
foreach s in sec absorb transf transfloss {
    gen lcumul`s' = l.cumul`s'
}


// Non-ethnic counterparts: all changes of a type minus the ethnic ones
gen nethtransf = transfera-irra
gen nethabsorb = absorba-ethabsorba
gen nethsec = secessionb-ethsecb

// Running totals for past ethnic and non-ethnic border change
foreach pair in "irr irra" "nethtransf nethtransf" "ethabsorb ethabsorba" ///
                "nethabsorb nethabsorb" "ethsec ethsecb" "nethsec nethsec" {
    tokenize `pair'
    bysort idc (year): gen cumul`1' = sum(`2')
}

// Lagged versions
foreach s in irr nethtransf ethabsorb nethabsorb ethsec nethsec {
    gen lcumul`s' = l.cumul`s'
}

// Diffusion variables based on time window of past border change
/*
gen lsec = l.secessionb
gen lsec2 = lsec + l2.lsec
gen lsec3 = lsec2 + l3.lsec
gen lsec4 = lsec3 + l4.lsec
gen lsec5 = lsec4 + l5.lsec
gen lsec21 = 0
replace lsec21 = 1 if lsec2 > 0 & lsec2!=.
gen lsec51 = 0
replace lsec51 = 1 if lsec5 > 0 & lsec5!=.
gen lndep = log(ndep+1)


gen l5sec = l.secessionb
replace l5sec = 1 if l2.secessionb == 1
gen l2sec = l5sec
replace l5sec = 1 if l3.secessionb == 1 
replace l5sec = 1 if l4.secessionb == 1
replace l5sec = 1 if l5.secessionb == 1


gen l5ethsec = l.ethsecb
replace l5ethsec = 1 if l2.ethsecb == 1
gen l2ethsec = l5ethsec
replace l5ethsec = 1 if l3.ethsecb == 1 
replace l5ethsec = 1 if l4.ethsecb == 1
replace l5ethsec = 1 if l5.ethsecb == 1


gen l5nethsec = l.nethsec
replace l5nethsec = 1 if l2.nethsec == 1
gen l2nethsec = l5nethsec
replace l5nethsec = 1 if l3.nethsec == 1 
replace l5nethsec = 1 if l4.nethsec == 1
replace l5nethsec = 1 if l5.nethsec == 1


gen l5ethabs = l.ethabsorba
replace l5ethabs = 1 if l2.ethabsorba == 1
gen l2ethabs = l5ethabs
replace l5ethabs = 1 if l3.ethabsorba == 1 
replace l5ethabs = 1 if l4.ethabsorba == 1
replace l5ethabs = 1 if l5.ethabsorba == 1


gen l5nethabs = l.nethabsorb
replace l5nethabs = 1 if l2.nethabsorb == 1
gen l2nethabs = l5nethabs
replace l5nethabs = 1 if l3.nethabsorb == 1 
replace l5nethabs = 1 if l4.nethabsorb == 1
replace l5nethabs = 1 if l5.nethabsorb == 1
*/

// Possible to add world indices of secession here...


// Border change event: the logged net gain DV is non-zero (and not missing)
gen bcevent = (lnetgain != 0 & lnetgain != .)

// Counter for the waiting time until border change events, plus three splines
btscs bcevent year idc, gen(y) nspline(3)
forvalues k = 1/3 {
    rename _spline`k' spline`k'
}

// Age counter for the waiting time until death (within sample), plus three splines
btscs death year idc, gen(age) nspline(3)
forvalues k = 1/3 {
    rename _spline`k' aspline`k'
}

// Logged within-sample age
gen lnage = log(age+1)



// Additional one-year lags for the regressions.
// Each pair reads "<new lagged variable> <source variable>".
foreach pair in "llnetgain lnetgain" "lef ef" "lagfrac agfrac" "llarea lnarea" ///
                "llage lnage" "llelevmean lnelevmean" "llelevsd lnelevsd" {
    tokenize `pair'
    gen `1' = l.`2'
}

// Logged population and its lag
gen lnpop = log(population)
gen llpop = l.lnpop


// One minus maxshare (intdj), current and lagged
gen lmaxshare = l.maxshare
gen intdj = 1-maxshare
gen lintdj = 1-lmaxshare
// gen lminshare = 1-lmaxshare

// One minus maxagshare (extdj), current and lagged
gen lmaxagshare = l.maxagshare
gen extdj = 1-maxagshare
gen lextdj = 1-lmaxagshare


// Write the final analysis dataset
cd $OUTDIR
save analysis_data.dta, replace









